libraries used.
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:plyr’:
arrange, count, desc, failwith, id, mutate, rename, summarise, summarize
The following object is masked from ‘package:gridExtra’:
combine
The following objects are masked from ‘package:igraph’:
as_data_frame, groups, union
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
Data Mining from Indeed Website
# Scrape the review texts page by page.
# `n` holds the values passed as the `start` query parameter (0, 20, ..., 2820);
# presumably each page lists 20 reviews -- TODO confirm against the site.
n <- seq(0, 2835, 20)
review_pages <- lapply(n, function(start) {
  # The offset has to be pasted in as a value: the literal text "start=i"
  # inside the string would request the same URL on every iteration.
  page_url <- paste0(
    "https://www.indeed.co.in/cmp/Northwestern-Mutual/reviews?fcountry=ALL&start=",
    start
  )
  read_html(page_url) %>%
    html_nodes(".cmp-review-text") %>%
    html_text(trim = TRUE)
})
# One character vector holding every review text, in page order.
ht <- unlist(review_pages)
# The cleaning steps further down operate on `FCA_html`.
FCA_html <- ht
# Data pre-processing: strip markup residue from the scraped review texts.
# Line breaks are replaced by a space (not by "") so that the words on either
# side of the break are not fused into a single token.
FCA_html <- gsub("\n", " ", FCA_html)
FCA_html <- FCA_html %>%
  # remove all round brackets
  str_replace_all("\\(|\\)", "") %>%
  # remove all backslashes
  str_replace_all("\\\\", "") %>%
  # remove bullet characters together with the space that follows them
  str_replace_all("\\• ", "") %>%
  # replace a free-standing ampersand by a space so "a & b" stays two words
  str_replace_all("\\ & ", " ") %>%
  # remove all non-printable characters
  str_replace_all("[^[:print:]]", "") %>%
  # remove all double quotes
  str_replace_all(pattern = "\"", replacement = "") %>%
  # collapse the whitespace runs left behind by the replacements above
  str_replace_all("\\s+", " ")

# Alternatives that were tried and left disabled:
# FCA_html <- FCA_html %>% str_replace_all("[^A-Za-z0-9]", "")
# FCAindeed2 <- FCAindeed2 %>% stringi::stri_unescape_unicode()
# FCA_html %>% str_replace_all(pattern = "[[:digit:]]+", replacement = "")
# tm::removeNumbers(FCA_html)

#### pattern for dates
# Four digits, then two digits, then two digits, with optional "." / "-"
# separators and spaces in between (e.g. 2019-01-31 or 20190131).
pattern <- "\\(?\\d{4}\\)?[.-]? *\\d{2}[.-]? *[.-]?\\d{2}"
# List with one character vector of matches per review text.
date <- FCA_html %>% str_extract_all(pattern)
# FCA_html[[1]] %>% str_subset(pattern = "([0-9]{1,2})[- .]([a-zA-Z]+)[- .]([0-9]{4})")
# FCA_html[[1]]
# unlist(Date)
# FCA_html_2 = data_frame(Date = as.Date(unlist(date)), FCA_html)
# Wrap the cleaned review texts in a one-column tibble.
indeed <- tibble(text = c(FCA_html))

# Normalise the text for word counting and sentiment scoring.
# convert all text to lower case
df_lower <- tolower(indeed$text)

# Remove the stand-alone token "rt" only. Without the word boundaries the
# pattern also deletes "rt" inside ordinary words ("support" -> "suppo").
df_lower <- gsub("\\brt\\b", "", df_lower)

# Remove @UserName mentions
df_lower <- gsub("@\\w+", "", df_lower)

# Remove punctuation
df_lower <- gsub("[[:punct:]]", "", df_lower)

# Remove links. This runs after punctuation removal, so a URL has already
# collapsed into a single "http..." word that the pattern matches in full.
df_lower <- gsub("http\\w+", "", df_lower)

# Collapse runs of spaces/tabs into ONE space; replacing them with ""
# would glue the neighbouring words together.
df_lower <- gsub("[ \t]{2,}", " ", df_lower)

# Remove blank space at the beginning and at the end
df_lower <- trimws(df_lower)
# Turn the cleaned review texts into a tm corpus and strip the stop words
# returned by stopwords() from every document.
corpus <- df_lower %>%
  VectorSource() %>%
  Corpus() %>%
  tm_map(removeWords, stopwords())
transformation drops documents
# Draw a word cloud of the corpus: only terms occurring at least 10 times,
# at most 500 terms, colours picked at random from the "Dark2" palette.
wordcloud(
  corpus,
  min.freq = 10,
  max.words = 500,
  random.color = TRUE,
  colors = brewer.pal(8, "Dark2")
)

# Score every cleaned review with the NRC lexicon (one column per sentiment).
mysentiment <- get_nrc_sentiment(df_lower)

# Total score per sentiment, as a two-column table: sentiment name and Score.
sentiment_totals <- colSums(mysentiment)
Sentimentscores <- data.frame(
  sentiment = names(sentiment_totals),
  # unname() keeps the default 1..n row names instead of the sentiment names
  Score = unname(sentiment_totals)
)
# Interactive horizontal bar chart of the total score per sentiment,
# bars ordered by score.
# The title and axis labels are set once via labs(). The earlier
# ggtitle()/xlab()/ylab() calls were overridden by a later labs() whose title
# referred to tweets on #MachineLearning rather than to the employee reviews.
ggplotly(
  ggplot(
    data = Sentimentscores,
    aes(x = reorder(sentiment, Score), y = Score)
  ) +
    geom_bar(aes(fill = sentiment), stat = "identity") +
    labs(
      title = "Sentiments of Employees behind the comments or rating",
      x = "Sentiments",
      y = "Scores"
    ) +
    theme(
      legend.position = "none",
      # hjust = 0.5 centres the title; the previous value of -10 is far
      # outside the 0-1 range ggplot2 uses for justification
      plot.title = element_text(hjust = 0.5, face = "bold", color = "black")
    ) +
    coord_flip()
)
Finding the word counts in the text data.
# Term-document matrix of the corpus: one row per term, one column per document.
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)

# Total occurrences of each term across all documents, most frequent first.
v <- m %>%
  rowSums() %>%
  sort(decreasing = TRUE)

# Frequency table (term, count), ordered by descending count.
d <- data.frame(word = names(v), freq = v)
d <- d[order(-d$freq), ]

# Show the 20 most frequent terms.
head(d, 20)
Showing the 10 most frequently used words in the text.
# Interactive horizontal bar chart of the most frequent words.
# head() returns at most 10 rows; d[1:10, ] would pad the data with NA rows
# whenever the frequency table has fewer than 10 terms.
top_words <- head(d, 10)
ggplotly(
  ggplot(
    top_words,
    aes(x = reorder(word, freq), y = freq, fill = as.factor(word))
  ) +
    geom_bar(stat = "identity") +
    # NOTE(review): vjust = 10 is kept from the original; it shifts each
    # label far from its bar in a static ggplot -- confirm it is intended.
    geom_text(aes(label = freq), vjust = 10) +
    labs(
      title = "The Top 10 frequently used words",
      x = "Words",
      y = "Frequency of Words",
      fill = "Words"
    ) +
    # hjust = 0.5 centres the title; the previous value of -10 is far
    # outside the 0-1 range ggplot2 uses for justification
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", color = "black")
    ) +
    coord_flip()
)
library(data.table)

# Donut chart showing the share of each of the most frequent words.
d <- as.data.table(d)
# head() returns at most 10 rows, so no NA rows appear for short tables.
p <- head(d, 10) %>%
  # Both labels and values are taken from the piped rows. The previous
  # `~d[1:10, freq]` reached back to the global table, so values were not
  # guaranteed to stay aligned with their labels. The group_by(word) step was
  # dropped: every word occurs once, and the grouping was never removed.
  plot_ly(labels = ~word, values = ~freq) %>%
  add_pie(hole = 0.6) %>%
  layout(
    title = "The Percentage of top 10 frequently occuring words.",
    showlegend = FALSE,
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )
p
LS0tDQp0aXRsZTogIkluZGVlZCBFbXBsb3llZSBSZXZpZXcgZGF0YSBhbmFseXNpcyBvZiBjb21wYW55IE5vcnRod2VzdGVybiBNdXR1YWwgIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCmxpYnJhcmllcyB1c2VkLg0KYGBge3J9DQojaW5zdGFsbC5wYWNrYWdlcygicGFjbWFuIikNCmxpYnJhcnkocGFjbWFuKQ0KbGlicmFyeSh2aXJpZGlzKQ0KbGlicmFyeShnZ3JpZGdlcykNCmxpYnJhcnkoaWdyYXBoKQ0KbGlicmFyeShoaWdoY2hhcnRlcikNCiNpbnN0YWxsLnBhY2thZ2VzKCJnZ3JhcGgiKQ0KbGlicmFyeShnZ3JhcGgpDQpsaWJyYXJ5KGdncGxvdDIpDQpsaWJyYXJ5KHBsb3RseSkNCiNpbnN0YWxsLnBhY2thZ2VzKCJ0bSIpDQpsaWJyYXJ5KHRtKQ0KI2luc3RhbGwucGFja2FnZXMoInRpZHlxdWFudCIpDQojbGlicmFyeSh0aWR5cXVhbnQpDQojaW5zdGFsbC5wYWNrYWdlcyh0aWR5dGV4dCkNCmxpYnJhcnkodGlkeXRleHQpDQojaW5zdGFsbC5wYWNrYWdlcygiUlRleHRUb29scyIpDQojaW5zdGFsbC5wYWNrYWdlcygiUlRleHRUb29scyIsIHJlcG9zID0gImh0dHA6Ly9yLmZpbmRhdGEub3JnIikNCiNpbnN0YWxsLnBhY2thZ2VzKCJSVGV4dFRvb2xzIiwgZGVwZW5kZW5jaWVzPVRSVUUsIHJlcG9zPSdodHRwOi8vY3Jhbi5yc3R1ZGlvLmNvbS8nKQ0KI2luc3RhbGwucGFja2FnZXMoJ1JUZXh0VG9vbHMnLHJlcG9zPSdodHRwOi8vY3Jhbi51cy5yLXByb2plY3Qub3JnJykNCmxpYnJhcnkoU25vd2JhbGxDKQ0KbGlicmFyeSh0b3BpY21vZGVscykNCmxpYnJhcnkoZ2d0aGVtZXMpDQojaW5zdGFsbC5wYWNrYWdlcygid2FmZiIpDQojbGlicmFyeSh3YWZmKQ0KbGlicmFyeShncmlkKQ0KbGlicmFyeShncmlkRXh0cmEpDQojaW5zdGFsbC5wYWNrYWdlcygidGV4dHJlYWRyIikNCmxpYnJhcnkodGV4dHJlYWRyKQ0KI2luc3RhbGwucGFja2FnZXMoInhtbDIiLCdodHRwczovL2NyYW4ucnN0dWRpby5jb20vYmluL3dpbmRvd3MvY29udHJpYi8zLjUveG1sMl8xLjIuMC56aXAnKQ0KbGlicmFyeShwdXJycikNCmxpYnJhcnkoeG1sMikNCmxpYnJhcnkoc3RyaW5ncikNCiNpbnN0YWxsLnBhY2thZ2VzKCJydmVzdCIpDQpsaWJyYXJ5KHJ2ZXN0KQ0KbGlicmFyeShwbHlyKQ0KbGlicmFyeSh0aWJibGUpDQpsaWJyYXJ5KHdvcmRjbG91ZCkNCmxpYnJhcnkoc3l1emhldCkNCmxpYnJhcnkoZHBseXIpDQpgYGANCg0KRGF0YSBNaW5pbmcgZnJvbSBJbmRlZWQgV2Vic2l0ZQ0KYGBge3J9DQpodDwtbGlzdChjKCkpDQpuPXNlcSgwLDI4MzUsMjApDQpmb3IoaSBpbiBuKXsNCnk9cmVhZF9odG1sKHBhc3RlKCJodHRwczovL3d3dy5pbmRlZWQuY28uaW4vY21wL05vcnRod2VzdGVybi1NdXR1YWwvcmV2aWV3cz9mY291bnRyeT1BTEwmc3RhcnQ9aSIpKSU+JSBodG1sX25vZGVzKCIuY21wLXJldmlldy10ZXh0IiklPiVodG1sX3RleHQodHJpbSA9IFRSVUUpDQpodDwtYXBwZW5kKGh0LHgpDQpGQ0FfaHRtbDwtaHQNCn0NCkZDQV9odG1sPC1odA0K
YGBgDQoNCg0KYGBge3J9DQojRGF0YS1QcmVwcm9jZXNzaW5nOiByZW1vdmluZyAnXG4nDQpGQ0FfaHRtbDwtZ3N1YigiXG4iLCIiLEZDQV9odG1sKQ0KI3JlbW92ZSBhbGwgcm91bmQgYnJhY2tldHMNCkZDQV9odG1sPC1GQ0FfaHRtbCU+JXN0cl9yZXBsYWNlX2FsbCgiXFwofFxcKSIsICIiKQ0KI3JlbW92ZSBhbGwgXFwNCkZDQV9odG1sPC1GQ0FfaHRtbCU+JXN0cl9yZXBsYWNlX2FsbCgiXFxcXCIsICIiKQ0KI3JlbW92ZSBhbGwgbm9uIHdvcmRzIGFuZCBub24gbnVtYmVycw0KI0ZDQV9odG1sPC1GQ0FfaHRtbCU+JXN0cl9yZXBsYWNlX2FsbCgiW15BLVphLXowLTldIiwgIiIpDQojcmVtb3ZlIGFsbCDigKIgDQpGQ0FfaHRtbDwtRkNBX2h0bWwlPiVzdHJfcmVwbGFjZV9hbGwoIlxc4oCiICAiLCAiIikNCiNyZW1vdmUgYWxsICYgDQpGQ0FfaHRtbDwtRkNBX2h0bWwlPiVzdHJfcmVwbGFjZV9hbGwoIlxcICYgIiwgIiIpDQojcmVtb3ZlIGFsbCAgbm9uIHByaW50YWJsZSB3b3Jkcw0KRkNBX2h0bWw8LUZDQV9odG1sJT4lc3RyX3JlcGxhY2VfYWxsKCJbXls6cHJpbnQ6XV0iLCAiIikNCiNyZW1vdmUgYWxsIFwNCkZDQV9odG1sPC1GQ0FfaHRtbCU+JXN0cl9yZXBsYWNlX2FsbChwYXR0ZXJuID0gIlwiIiwgcmVwbGFjZW1lbnQgPSAiIikNCiNGQ0FpbmRlZWQyPC1GQ0FpbmRlZWQyJT4lc3RyaW5naTo6c3RyaV91bmVzY2FwZV91bmljb2RlKCkNCiMgcmVtb3ZlIGRpZ2l0cw0KI0ZDQV9odG1sJT4lc3RyX3JlcGxhY2VfYWxsKHBhdHRlcm4gPSAiW1s6ZGlnaXQ6XV0rIiwgcmVwbGFjZW1lbnQgPSAiIikNCiN0bTo6cmVtb3ZlTnVtYmVycyhGQ0FfaHRtbCkNCiMjIyMgcGF0dGVybiBmb3IgZGF0ZXMNCnBhdHRlcm4gPSJcXCg/XFxkezR9XFwpP1suLV0/ICpcXGR7Mn1bLi1dPyAqWy4tXT9cXGR7Mn0iDQpkYXRlPUZDQV9odG1sJT4lc3RyX2V4dHJhY3RfYWxsKHBhdHRlcm4pDQojRkNBX2h0bWxbWzFdXSU+JXN0cl9zdWJzZXQocGF0dGVybiA9ICIoWzAtOV17MSwyfSlbLSAuXShbYS16QS1aXSspWy0gLl0oWzAtOV17NH0pIikNCiNGQ0FfaHRtbFtbMV1dDQojdW5saXN0KERhdGUpDQojRkNBX2h0bWxfMj1kYXRhX2ZyYW1lKERhdGU9YXMuRGF0ZSh1bmxpc3QoZGF0ZSkpLEZDQV9odG1sKQ0KDQpgYGANCg0KDQpgYGB7cn0NCmdldF9zZW50aW1lbnRzKGxleGljb24gPSAibnJjIikNCmBgYA0KDQpgYGB7cn0NCmluZGVlZCA8LSB0aWJibGUodGV4dCA9IGMoRkNBX2h0bWwpKQ0KYGBgDQoNCmBgYHtyfQ0KI2NvbnZlcnQgYWxsIHRleHQgdG8gbG93ZXIgY2FzZQ0KZGZfbG93ZXI8LSB0b2xvd2VyKGluZGVlZCR0ZXh0KQ0KDQojIFJlcGxhY2UgYmxhbmsgc3BhY2UgKOKAnHJ04oCdKQ0KZGZfbG93ZXIgPC0gZ3N1YigicnQiLCAiIiwgZGZfbG93ZXIpDQoNCiMgUmVwbGFjZSBAVXNlck5hbWUNCmRmX2xvd2VyIDwtIGdzdWIoIkBcXHcrIiwgIiIsIGRmX2xvd2VyKQ0KDQojIFJlbW92ZSBwdW5jdHVhdGlvbg0KZGZfbG93ZXIgPC0gZ3N1YigiW1s6
cHVuY3Q6XV0iLCAiIiwgZGZfbG93ZXIpDQoNCiMgUmVtb3ZlIGxpbmtzDQpkZl9sb3dlciA8LSBnc3ViKCJodHRwXFx3KyIsICIiLCBkZl9sb3dlcikNCg0KIyBSZW1vdmUgdGFicw0KZGZfbG93ZXI8LSBnc3ViKCJbIHxcdF17Mix9IiwgIiIsIGRmX2xvd2VyKQ0KDQojIFJlbW92ZSBibGFuayBzcGFjZXMgYXQgdGhlIGJlZ2lubmluZw0KZGZfbG93ZXIgPC0gZ3N1YigiXiAiLCAiIiwgZGZfbG93ZXIpDQoNCiMgUmVtb3ZlIGJsYW5rIHNwYWNlcyBhdCB0aGUgZW5kDQpkZl9sb3dlciA8LSBnc3ViKCIgJCIsICIiLGRmX2xvd2VyICkNCmBgYA0KDQpgYGB7cn0NCiNjbGVhbiB1cCBieSByZW1vdmluZyBzdG9wIHdvcmRzDQpjb3JwdXM8LUNvcnB1cyhWZWN0b3JTb3VyY2UoZGZfbG93ZXIpKQ0KY29ycHVzIDwtIHRtX21hcChjb3JwdXMsIGZ1bmN0aW9uKHgpcmVtb3ZlV29yZHMoeCxzdG9wd29yZHMoKSkpDQpgYGANCg0KYGBge3J9DQojZ2VuZXJhdGUgd29yZGNsb3VkDQp3b3JkY2xvdWQoY29ycHVzLG1pbi5mcmVxID0gMTAsY29sb3JzPWJyZXdlci5wYWwoOCwgIkRhcmsyIikscmFuZG9tLmNvbG9yID0gVFJVRSxtYXgud29yZHMgPSA1MDApDQpgYGANCmBgYHtyfQ0KI2dldHRpbmcgZW1vdGlvbnMgdXNpbmcgaW4tYnVpbHQgZnVuY3Rpb24NCm15c2VudGltZW50PC1nZXRfbnJjX3NlbnRpbWVudCgoZGZfbG93ZXIpKQ0KYGBgDQoNCg0KYGBge3J9DQojY2FsY3VsYXRpb25nIHRvdGFsIHNjb3JlIGZvciBlYWNoIHNlbnRpbWVudA0KU2VudGltZW50c2NvcmVzPC1kYXRhLmZyYW1lKGNvbFN1bXMobXlzZW50aW1lbnRbLF0pKQ0KbmFtZXMoU2VudGltZW50c2NvcmVzKTwtIlNjb3JlIg0KU2VudGltZW50c2NvcmVzPC1jYmluZCgic2VudGltZW50Ij1yb3duYW1lcyhTZW50aW1lbnRzY29yZXMpLFNlbnRpbWVudHNjb3JlcykNCnJvd25hbWVzKFNlbnRpbWVudHNjb3Jlcyk8LU5VTEwNCmBgYA0KDQpgYGB7cn0NCiNwbG90dGluZyB0aGUgc2VudGltZW50cyB3aXRoIHNjb3Jlcw0KZ2dwbG90bHkoZ2dwbG90KGRhdGE9U2VudGltZW50c2NvcmVzLGFlcyh4PXJlb3JkZXIoc2VudGltZW50LFNjb3JlKSx5PVNjb3JlKSkrZ2VvbV9iYXIoYWVzKGZpbGw9c2VudGltZW50KSxzdGF0ID0gImlkZW50aXR5IikrDQogIHRoZW1lKGxlZ2VuZC5wb3NpdGlvbj0ibm9uZSIpKw0KICB4bGFiKCJTZW50aW1lbnRzIikreWxhYigic2NvcmVzIikrZ2d0aXRsZSgiU2VudGltZW50cyBvZiBFbXBsb3llZXMgYmVoaW5kIHRoZSBjb21tZW50cyBvciByYXRpbmcgIikrbGFicyh0aXRsZSA9ICJTZW50aW1lbnRzIG9mIHBlb3BsZSBiZWhpbmQgdGhlIHR3ZWV0cyBvbiAjTWFjaGluZUxlYXJuaW5nIix4PSJTZW50aW1lbnRzIiwgeSA9ICJTY29yZXMiKSt0aGVtZShwbG90LnRpdGxlID0gZWxlbWVudF90ZXh0KGhqdXN0ID0gLTEwLGZhY2UgPSAiYm9sZCIsY29sb3IgPSAiYmxhY2siKSkrY29vcmRfZmxpcCgpKQ0KYGBgDQpGaW5kaW5nIHRoZSB3b3JkcyBjb3VudCBpbiB0
aGUgdGV4dCBkYXRhLg0KYGBge3J9DQpkdG0gPC0gVGVybURvY3VtZW50TWF0cml4KGNvcnB1cykNCm0gPC0gYXMubWF0cml4KGR0bSkNCnYgPC0gc29ydChyb3dTdW1zKG0pLGRlY3JlYXNpbmc9VFJVRSkNCmQgPC0gZGF0YS5mcmFtZSh3b3JkID0gbmFtZXModiksZnJlcT12KQ0KZDwtIGRbd2l0aChkLG9yZGVyKC1mcmVxKSksXQ0KaGVhZChkLCAyMCkNCmBgYA0KDQpzaG93aW5nIHRoZSB0b3AgMTAgaGlnaCBmcmVxdWVuY3kgd29yZHMgIHVzZWQgaW4gdGhlIHRleHQuDQpgYGB7cn0NCmdncGxvdGx5KGdncGxvdChkWzE6MTAsXSxhZXMoeD1yZW9yZGVyKHdvcmQsZnJlcSkseT1mcmVxLGZpbGw9YXMuZmFjdG9yKHdvcmQpKSkrZ2VvbV9iYXIoc3RhdCA9ICJpZGVudGl0eSIpKyBnZW9tX3RleHQoYWVzKGxhYmVsPWZyZXEpLCB2anVzdD0xMCkrbGFicyh0aXRsZSA9ICJUaGUgVG9wIDEwIGZyZXF1ZW50bHkgdXNlZCB3b3JkcyIseD0iV29yZHMiLCB5ID0gIkZyZXF1ZW5jeSBvZiBXb3JkcyIsZmlsbD0iV29yZHMiKSt0aGVtZShwbG90LnRpdGxlID0gZWxlbWVudF90ZXh0KGhqdXN0ID0gLTEwLGZhY2UgPSAiYm9sZCIsY29sb3IgPSAiYmxhY2siKSkrY29vcmRfZmxpcCgpKQ0KYGBgDQoNCg0KYGBge3J9DQpsaWJyYXJ5KGRhdGEudGFibGUpDQpkPC1hcy5kYXRhLnRhYmxlKGQpDQpwIDwtIGRbMToxMCxdICU+JQ0KIGdyb3VwX2J5KHdvcmQpICU+JQ0KIHBsb3RfbHkobGFiZWxzID0gfndvcmQsIHZhbHVlcyA9IH5kWzE6MTAsZnJlcV0pICU+JQ0KIGFkZF9waWUoaG9sZSA9IDAuNikgJT4lDQogbGF5b3V0KHRpdGxlID0gIlRoZSBQZXJjZW50YWdlIG9mIHRvcCAxMCBmcmVxdWVudGx5IG9jY3VyaW5nIHdvcmRzLiIsICBzaG93bGVnZW5kID0gRiwNCiAgICAgICAgIHlheGlzID0gbGlzdChzaG93Z3JpZCA9IEZBTFNFLCB6ZXJvbGluZSA9IEZBTFNFLCBzaG93dGlja2xhYmVscyA9IEZBTFNFKSkNCnANCmBgYA0KDQoNCg0KDQoNCg==